# Loading data
## Books
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Books/")
stats.books <- read.csv(file="dataset_stats_books.tsv",head=T,sep="\t")
books.nodes <- read.csv(file="books_nodes.tsv",head=T,sep="\t")
books.edges <- read.csv(file="books_edges.tsv",head=T,sep="\t")

## Wikipedia
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Wiki/")
stats.wiki <- read.csv(file="dataset_stats_wikipedia.tsv",head=T,sep="\t")
wiki.nodes <- read.csv(file="wikipedia_nodes.tsv",head=T,sep="\t")
wiki.edges <- read.csv(file="wikipedia_edges.tsv",head=T,sep="\t")

## Twitter
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Twitter/")
stats.twitter <- read.csv(file="dataset_stats_twitter.tsv",head=T,sep="\t")
twitter.nodes <- read.csv(file="twitter_gln_nodes.tsv",head=T,sep="\t")
twitter.edges <- read.csv(file="twitter_gln_links_with_over_expression.tsv",head=T,sep="\t")

## Utility
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Utility/")
gdp.country <- read.csv(file="gdp_pc_population_by_country.tsv",head=T,sep="\t")
gdp.language <- read.csv(file="gdp_pc_population_by_language.tsv",head=T,sep="\t")
language.conversion <- read.csv(file="language_conversion_table_iso639-3.tsv",head=T,sep="\t")
language.speakers <- read.csv(file="language_speakers_families_iso639-3.tsv",head=T,sep="\t")

## Famous
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Famous individuals/")
famous.murr.country <- read.csv(file="Famous_murray_by_country.tsv",head=T,sep="\t")
famous.murr.language <- read.csv(file="Famous_murray_by_language.tsv",head=T,sep="\t")
famous.wiki.country <- read.csv(file="Famous_wikipedia_by_country.tsv",head=T,sep="\t")
famous.wiki.language <- read.csv(file="Famous_wikipedia_by_language.tsv",head=T,sep="\t")

## Analysis
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Analysis/")
Analysis <- read.csv(file="centralities_by_language.tsv",head=T,sep="\t")
Analysis$language.name <- as.character(Analysis$language.name)
Analysis$language.name <- gsub(" \\(macrolanguage\\)", "", Analysis$language.name)

## Country names and codes
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Countries/")
country <- read.csv(file="count_name_code.csv",head=T,sep=",")
country$Name <- as.character(country$Name)
country$Name[country$Name=="Russian Federation"] <- "Russia"

#########################################################################################################
# Replicating the figures

library(plyr)
library(tidyr)
library(ggplot2)

setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Replication")

# Fig.2
logPlotA<-function(){
  name <- "Figure 2 Plot A"
  mrga <- merge(x=stats.books,y=stats.wiki,by.x = "Code", by.y="Code")
  x <- mrga$TranslationsFrom
  y <- mrga$Editors
  pdf(file="2A.pdf", width = 12, height = 6)
  plot(
    xy.coords(x,y),
    xlim=c(1,max(x)*10),
    ylim=c(1,max(y)*10),
    log="xy",
    xlab="Book Translations (From)",
    ylab="Wikipedia Editors",
    cex=0.3,
    main=name
  )
  text(x,y,labels=mrga$Code)
  adjr2 = round(summary(lm(y~x))$adj.r.squared,2)
  legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
  
  dev.off()
}

logPlotA()

logPlotB<-function(){
  name <- "Figure 2 Plot B"
  mrga <- merge(x=stats.books,y=stats.twitter,by.x = "Code", by.y="Code")
  x <- mrga$TranslationsFrom
  y <- mrga$Users
  pdf(file="2B.pdf", width = 12, height = 6)
  plot(
    xy.coords(x,y),
    xlim=c(1,max(x)*10),
    ylim=c(1,max(y)*10),
    log="xy",
    xlab="Book Translations (From)",
    ylab="Twitter Users",
    cex=0.3,
    main=name
  )
  text(x,y,labels=mrga$Code)
  adjr2 = round(summary(lm(y~x))$adj.r.squared,2)
  legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
  
  dev.off()
}

logPlotB()

logPlotC<-function(){
  name <- "Figure 2 Plot C"
  mrga <- merge(x=stats.wiki,y=stats.twitter,by.x = "Code", by.y="Code")
  x <- mrga$Editors
  y <- mrga$Users
  pdf(file="2C.pdf", width = 12, height = 6)
  
  plot(
    xy.coords(x,y),
    xlim=c(1,max(x)*10),
    ylim=c(1,max(y)*10),
    log="xy",
    xlab="Wikipedia Editors",
    ylab="Twitter Users",
    cex=0.3,
    main=name
  )
  text(x,y,labels=mrga$Code)
  adjr2 = round(summary(lm(y~x))$adj.r.squared,2)
  legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
  dev.off()
}
logPlotC()

logPlotD<-function(){
  name <- "Figure 2 Plot D"
  mrga <- merge(x=books.edges,y=wiki.edges,by.x = c("SourceLanguageCode","TargetLanguageCode"), by.y=c("SourceLanguageCode","TargetLanguageCode"))
  x <- mrga$Coocurrences.x
  y <- mrga$Coocurrences.y
  pdf(file="2D.pdf", width = 12, height = 6)
  
  plot(
    xy.coords(x,y),
    xlim=c(1,max(x)*10),
    ylim=c(1,max(y)*10),
    log="xy",
    xlab="Book Translations",
    ylab="Common Wikipedia Editors",
    cex=0.3,
    main=name
  )
  adjr2 = round(summary(lm(y~x))$adj.r.squared,2)
  legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
  dev.off()
}
logPlotD()

logPlotE<-function(){
  name <- "Figure 2 Plot E"
  mrga <- merge(x=books.edges,y=twitter.edges,by.x = c("SourceLanguageCode","TargetLanguageCode"), by.y=c("Source","Target"))
  x <- mrga$Coocurrences
  y <- mrga$Common.Users
  pdf(file="2E.pdf", width = 12, height = 6)
  
  plot(
    xy.coords(x,y),
    xlim=c(1,max(x)*10),
    ylim=c(1,max(y)*10),
    log="xy",
    xlab="Book Translations",
    ylab="Common Twitter Users",
    cex=0.3,
    main=name
  )
  adjr2 = round(summary(lm(y~x))$adj.r.squared,2)
  legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
  dev.off()
}
logPlotE()

logPlotF<-function(){
  name <- "Figure 2 Plot F"
  mrga <- merge(x=wiki.edges,y=twitter.edges,by.x = c("SourceLanguageCode","TargetLanguageCode"), by.y=c("Source","Target"))
  x <- mrga$Coocurrences
  y <- mrga$Common.Users
  pdf(file="2F.pdf", width = 12, height = 6)
  
  plot(
    xy.coords(x,y),
    xlim=c(1,max(x)*10),
    ylim=c(1,max(y)*10),
    log="xy",
    xlab="Common Wikipedia Editors",
    ylab="Common Twitter Users",
    cex=0.3,
    main=name
  )
  adjr2 = round(summary(lm(y~x))$adj.r.squared,2)
  legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
  dev.off()
}
logPlotF()

#########################################################################################################
#Fig.3
## Preprocessing
IV <- merge(gdp.language,Analysis[,seq(2,5)],by.x="lang",by.y="language.code")
WikiFame <- merge(IV,famous.wiki.language[,seq(1,2)],by.x="lang",by.y="lang")
WikiFame <- na.omit(WikiFame)
WikiFame <- WikiFame[!WikiFame$books.eigenvector==0,]
WikiFame <- WikiFame[!WikiFame$exports_1800_1950<=1,]
MurrFame <- merge(IV,famous.murr.language[,seq(1,2)],by.x="lang",by.y="lang")
MurrFame <- na.omit(MurrFame)
MurrFame <- MurrFame[!MurrFame$books.eigenvector==0,]
MurrFame <- MurrFame[!MurrFame$exports_1800_1950<=1,]
plotdata <- merge(WikiFame,MurrFame[,c(1,7)],by.x="lang",by.y="lang",all=T)
plotdata2 <- plotdata %>% gather(colsource,EV.Cent,twitter.eigenvector:books.eigenvector)
plotdata2 <- plotdata2 %>% gather(rowsource,Famous,exports_1800_1950.x:exports_1800_1950.y)
plotdata3 <- plotdata2
plotdata3$EV.Cent <- log10(plotdata3$EV.Cent)
plotdata3$Famous <- log10(plotdata3$Famous)
plotdata3 <- na.omit(plotdata3)
plotdata3$rowsource = factor(plotdata3$rowsource,labels=c("log(Wikipedia 26+ famous people)","log(HA famous people)"))
plotdata3$colsource = factor(plotdata3$colsource,labels=c("log(Twitter Eigenvector Cent.)","log(Wikipedia Eigenvector Cent.)","log(Book Trans. Eigenvector Cent.)"))

names(plotdata3)[2] <- "GDP_per_Capita"
names(plotdata3)[3] <- "Number_of_Speakers"



adjR2p <- ddply(plotdata3, c("rowsource","colsource"), summarize, adjr2 = round(summary(lm(Famous~EV.Cent))$adj.r.squared,3), p= 0.001)

pdf(file="Fig3.pdf", width = 12, height = 6)

ggplot(plotdata3,aes(EV.Cent, Famous))+
  geom_point(aes(size=Number_of_Speakers,color = GDP_per_Capita))+scale_size_area(max_size = 18)+
  geom_text(aes(label=lang), size = 5)+
   geom_smooth(method = "lm", se = FALSE)+
  geom_text(data=adjR2p,aes(label=(paste("R2=", adjr2, "\n p<", p))), x=-5, y=3,size=5)+
   facet_grid(rowsource~colsource)+
   xlab("")+
   ylab("")+
   ylim(-0.2,3.7)+
    xlim(-6.2,0.3)+ 
   theme_bw()+
   scale_color_distiller(type='seq', palette='Reds',guide='colourbar',direction=1)+
   guides(fill = guide_legend(reverse = TRUE))+
   guides(color = guide_legend(reverse = TRUE))+
   theme(text=element_text(size=18),legend.position="top",aspect.ratio=1/1,
         panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
         legend.text=element_text(size=20),strip.background = element_rect(color="black",fill="white"))

dev.off()

#########################################################################################################
#Fig.4
#Fig.4 (A)
# Preprocessing
IV <- merge(gdp.language,Analysis[,seq(2,5)],by.x="lang",by.y="language.code")
WikiFame <- merge(IV,famous.wiki.language[,seq(1,2)],by.x="lang",by.y="lang")
WikiFame <- na.omit(WikiFame)
WikiFame <- WikiFame[!WikiFame$books.eigenvector==0,]
WikiFame <- WikiFame[!WikiFame$exports_1800_1950<=1,]

# Analysis
## Inspection
### First plot the distribution of dependent variable.
hist(WikiFame$exports_1800_1950,100)
### does not seem normal. Maybe lognormal.
hist(log10(WikiFame$exports_1800_1950),100)

# Regress
## Do for the 7 models.
l1 <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc), data=WikiFame)
l2 <- lm(formula = log10(exports_1800_1950) ~ log10(twitter.eigenvector), data=WikiFame)
l3 <- lm(formula = log10(exports_1800_1950) ~ log10(wikipedia.eigenvector), data=WikiFame)
l4 <- lm(formula = log10(exports_1800_1950) ~ log10(books.eigenvector), data=WikiFame)
l5 <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ log10(twitter.eigenvector), data=WikiFame)
l6 <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ log10(wikipedia.eigenvector), data=WikiFame)
l7 <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ log10(books.eigenvector), data=WikiFame)

## Checking the effect of Twitter, Wikipedia, and Book Trans. EG Centrality between the two models (with and without).
anova(l1,l6)
anova(l1,l7)

# Output into a table


library(stargazer)
## This will produce an html file.
stargazer(l1, l2, l3, l4, l5, l6, l7, title="", font.size = "footnotesize",           
          align=TRUE, type="html", out="Fig.4.A.html",
          covariate.labels = c("log<sub>10</sub>(Population)","log<sub>10</sub>(GDP per capita)", "EV centrality [Twitter]",
                               "EV centrality [Wikipedia]","EV centrality [book trans.]", "(intercept)"),
          dep.var.labels.include = FALSE, omit.stat = c("ser", "f"),
          dep.var.caption = "Number of famous people born 1800-1950 per language, <br> based on having biographies in at least 26 Wikipedia language editions",
          notes = "***,**,* significant at 0.1%, 1% and 5% levels, respectively. Standard errors in parentheses. <br> Only languages with at least one famous person are included.",
          notes.label = "", notes.append = FALSE, notes.align = "l")

#Fig.4 (B)
fam.wiki.count <- merge(famous.wiki.country[,1:2],country,by.x="country_code",by.y="Code")
fam.wiki.count <- fam.wiki.count[with(fam.wiki.count,order(-exports_1800_1950)),]
stargazer(fam.wiki.count[1:10,c("Name","exports_1800_1950")], summary=FALSE, digits=1,
          type="html", out="Fig.4.B.html", colnames = FALSE,rownames = FALSE,digit.separate=0,
          title ="Famous people by country")

#Fig.4 (C)
fam.wiki.lang <- merge(famous.wiki.language[,1:2],Analysis[,1:2],by.x="lang",by.y="language.code")
fam.wiki.lang <- fam.wiki.lang[with(fam.wiki.lang,order(-exports_1800_1950)),]
stargazer(fam.wiki.lang[1:10,c("language.name","exports_1800_1950")], summary=FALSE, digits=1,
          type="html", out="Fig.4.C.html", colnames = FALSE,rownames = FALSE,
          title ="Famous people by language")

#Fig.4 (D)
Twitter.EV <- Analysis[with(Analysis,order(-twitter.eigenvector)),c("language.name","twitter.eigenvector")]
stargazer(Twitter.EV[1:7,c("language.name","twitter.eigenvector")], summary=FALSE, digits=2,
          type="html", out="Fig.4.D.html", colnames = FALSE,rownames = FALSE,
          title ="Twitter EV Cent.")

#Fig.4 (E)
Wiki.EV <- Analysis[with(Analysis,order(-wikipedia.eigenvector)),c("language.name","wikipedia.eigenvector")]
stargazer(Wiki.EV[1:7,c("language.name","wikipedia.eigenvector")], summary=FALSE, digits=2,
          type="html", out="Fig.4.E.html", colnames = FALSE,rownames = FALSE,
          title ="Wikipedia EV Cent.")

#Fig.4 (F)
Books.EV <- Analysis[with(Analysis,order(-books.eigenvector)),c("language.name","books.eigenvector")]
stargazer(Books.EV[1:7,c("language.name","books.eigenvector")], summary=FALSE, digits=2,
          type="html", out="Fig.4.F.html", colnames = FALSE,rownames = FALSE,
          title ="Book translation EV Cent.")

# Checking for the soundness of the models
N=nrow(WikiFame)
fit <- l1
bread <-vcov(fit)
est.fun <- estfun(fit)
meat <- t(est.fun)%*%est.fun
robust <- sandwich(fit, meat=crossprod(est.fun)/N)
robustse <- sqrt(diag(robust))
robustse

# Checking for the plausibility of the underlying assumptions
library(gvlma)
gvmodel <- gvlma(fit) 
summary(gvmodel)

#########################################################################################################
#Fig.5
#Fig.5 (A)
# Preprocessing
IV <- merge(gdp.language,Analysis[,seq(2,5)],by.x="lang",by.y="language.code")
MurrFame <- merge(IV,famous.murr.language[,seq(1,2)],by.x="lang",by.y="lang")
MurrFame <- na.omit(MurrFame)
MurrFame <- MurrFame[!MurrFame$books.eigenvector==0,]
MurrFame <- MurrFame[!MurrFame$exports_1800_1950<=1,]

# Analysis
## Inspection
### First plot the distribution of dependent variable.
hist(MurrFame$exports_1800_1950,100)
### does not seem normal. Maybe lognormal.
hist(log10(MurrFame$exports_1800_1950),100)

# Regress
## Do for the 7 models.
l1 <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc), data=MurrFame)
l2 <- lm(formula = log10(exports_1800_1950) ~ log10(twitter.eigenvector), data=MurrFame)
l3 <- lm(formula = log10(exports_1800_1950) ~ log10(wikipedia.eigenvector), data=MurrFame)
l4 <- lm(formula = log10(exports_1800_1950) ~ log10(books.eigenvector), data=MurrFame)
l5 <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ log10(twitter.eigenvector), data=MurrFame)
l6 <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ log10(wikipedia.eigenvector), data=MurrFame)
l7 <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ log10(books.eigenvector), data=MurrFame)

# Output into a table

library(stargazer)
## This will produce an html file.
stargazer(l1, l2, l3, l4, l5, l6, l7, title="", font.size = "footnotesize",           
          align=TRUE, type="html", out="Fig.5.A.html",
          covariate.labels = c("log<sub>10</sub>(Population)","log<sub>10</sub>(GDP per capita)", "EV centrality [Twitter]",
                               "EV centrality [Wikipedia]","EV centrality [book trans.]", "(intercept)"),
          dep.var.labels.include = FALSE, omit.stat = c("ser", "f"),
          dep.var.caption = "Number of famous people born 1800-1950 per language, <br> based on inclusion in <i>Human Accomplishment</i>",
          notes = "***,**,* significant at 0.1%, 1% and 5% levels, respectively. Standard errors in parentheses. <br> Only languages with at least one famous person are included.",
          notes.label = "", notes.append = FALSE, notes.align = "l")

#Fig.5 (B)
fam.murr.count <- merge(famous.murr.country[,1:2],country,by.x="country_code",by.y="Code")
fam.murr.count$exports_1800_1950 <- as.integer(as.character(fam.murr.count$exports_1800_1950))
fam.murr.count <- fam.murr.count[with(fam.murr.count,order(-exports_1800_1950)),]
stargazer(fam.murr.count[1:10,c("Name","exports_1800_1950")], summary=FALSE, digits=1,
          type="html", out="Fig.5.B.html", colnames = FALSE,rownames = FALSE,digit.separate=0,
          title ="Famous people by country")

#Fig.5 (C)
fam.murr.lang <- merge(famous.murr.language[,1:2],Analysis[,1:2],by.x="lang",by.y="language.code")
fam.murr.lang <- fam.murr.lang[with(fam.murr.lang,order(-exports_1800_1950)),]
stargazer(fam.murr.lang[1:10,c("language.name","exports_1800_1950")], summary=FALSE, digits=1,
          type="html", out="Fig.5.C.html", colnames = FALSE,rownames = FALSE,
          title ="Famous people by language")

# Checking for the soundness of the models
N=nrow(MurrFame)
fit <- l7
bread <-vcov(fit)
est.fun <- estfun(fit)
meat <- t(est.fun)%*%est.fun
robust <- sandwich(fit, meat=crossprod(est.fun)/N)
robustse <- sqrt(diag(robust))
robustse

# Checking for the plausibility of the underlying assumptions
library(gvlma)
gvmodel <- gvlma(fit) 
summary(gvmodel)

